In [102]:
from IPython.display import HTML

HTML('''<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit" value="Click here to toggle on/off the raw code."></form>''')
Out[102]:
In [103]:
%matplotlib inline
In [104]:
import numpy as np
import pandas as pd
import seaborn as sns
import itertools
import matplotlib.pyplot as plt

from bson.objectid import ObjectId

from sgmtradingcore.analytics.metrics import flat_capital_metrics
from stratagemdataprocessing.dbutils.mongo import MongoPersister
In [105]:
plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = (16, 8)
In [106]:
db = MongoPersister.init_from_config('backtesting', auto_connect=True)
In [107]:
name = 'coint'
desc = 'cochrane_orcutt'
code = 'ft12_fttp.nba'
mnemonic = 'ts.mmp_entryexit_barebones'
trading_id = '562f5bef497aee1c22000001'
config_id = '5a02ec269316de0f57457af3'
In [108]:
print 'Strategy name:', name
print 'Strategy desc:', desc
print 'Strategy code:', code
print 'Mnemonic:     ', mnemonic
Strategy name: coint
Strategy desc: cochrane_orcutt
Strategy code: ft12_fttp.nba
Mnemonic:      ts.mmp_entryexit_barebones
In [109]:
from pprint import pprint
pprint(db['strategy_configurations'].find_one({'_id': ObjectId(config_id)}))
{u'_id': ObjectId('5a02ec269316de0f57457af3'),
 u'params': {u'is_template_strategy': True,
             u'risk_controller': {u'component_name': u'PassAllRiskController',
                                  u'config': {},
                                  u'config_name': u'default'},
             u'scope_provider': {u'component_name': u'BasketballEventTradingScopeProvider',
                                 u'config': {u'competitions': [u'NBA']},
                                 u'config_name': u'nba'},
             u'signals_generators': [{u'component_name': u'CointegrationSignalGenerator',
                                      u'config': {u'interpolation_resolution': 10.0,
                                                  u'lookback_horizon': 1200.0,
                                                  u'pair_type': u'ft12_fttp',
                                                  u'trade_frequency': 60.0},
                                      u'config_name': u'ft12_fttp'}],
             u'sizers': {u'CointegrationTrader': {u'component_name': u'BasicSizer',
                                                  u'config': {u'capital_fraction': 0.01},
                                                  u'config_name': u'fractional'}},
             u'sport': u'basketball',
             u'sport_filter': [6],
             u'strategy_code': u'ft12_fttp.nba',
             u'strategy_desc': u'cochrane_orcutt',
             u'strategy_name': u'coint',
             u'style': u'in-play',
             u'subscription_after_delta': u'06:00:00:000000',
             u'subscription_before_delta': u'01:00:00:000000',
             u'subscription_timedelta': u'06:00:00:001000',
             u'traders': [{u'component_name': u'CointegrationTrader',
                           u'config': {u'max_holding_period': 0.0,
                                       u'min_holding_period': 0.0,
                                       u'odds_bounds': [1.05, 5.0],
                                       u'stop_loss': 0.0,
                                       u'take_profit': 0.0},
                           u'config_name': u'foo'}],
             u'trading_user_id': u'562f5bef497aee1c22000001'},
 u'sha256': u'4cace9d9e1eed7c51ee3fa4b36bc0581c0674172bdaa96baef84937bdece5ec4',
 u'strategy_code': u'ft12_fttp.nba',
 u'strategy_desc': u'cochrane_orcutt',
 u'strategy_name': u'coint',
 u'update_dt': datetime.datetime(2017, 11, 8, 11, 36, 6, 792000, tzinfo=<bson.tz_util.FixedOffset object at 0x7fe8026b8e50>)}
In [110]:
def to_dataframe(orders):
    settled = filter(lambda o: o['status_str'] == 'SETTLED', orders)

    cols = ['placed_time', 'pnl', 'size', 'bet_side', 'price', 'date_day', 'event_id', 'sticker', 'details']
    rcols = {'placed_time': 'dt', 'size': 'stake', 'price': 'odds', 'date_day': 'date'}

    df = pd.DataFrame(settled, columns=cols).rename(columns=rcols)

    df['is_back'] = (df['bet_side'] == 'back')
    df['capital'] = 10000
    df['reason'] = df['details'].apply(lambda d: d.get('reason'))
    df['pair_key'] = df['details'].apply(lambda d: tuple(d['pair_key']))
    df['portfolio_id'] = df['details'].apply(lambda d: d['portfolio_id'])
    df['plausible_returns'] = df['details'].apply(lambda d: d.get('plausible_returns', []))
    df['weight'] = df['details'].apply(lambda d: None if d['trade_intention']['name'] != 'OpenTradeSingleSticker' else d['signals'][0]['value'][d['trade_intention']['sticker']])

    del df['details']

    return df

rows = list(db['strategy_results'].find({
    'strategy_name': name,
    'strategy_desc': desc,
    'strategy_code': code,
#     'trading_user_id': trading_id,
    'mnemonic': mnemonic,
    'config_id': config_id
}))

df = pd.concat([
    to_dataframe(list(db['orders'].find({
        'strategy_result_id': str(r['_id'])
    }))) for r in rows if r['n_orders'] > 0
])

df['month'] = df.dt.dt.month

Metrics:

In [111]:
flat_capital_metrics(df, groupby='month').T
Out[111]:
month 1 2 3 10 11 12
n_trades 17061.000000 11702.000000 6696.000000 2634.000000 17787.000000 16513.000000
n_win 8473.000000 5844.000000 3385.000000 1284.000000 8751.000000 8344.000000
n_loss 8588.000000 5858.000000 3311.000000 1350.000000 9036.000000 8169.000000
hit_ratio 0.496630 0.499402 0.505526 0.487472 0.491989 0.505299
average_trade_win 0.003925 0.003698 0.004075 0.003905 0.003822 0.003894
average_trade_loss 0.004780 0.004569 0.004895 0.004632 0.004597 0.005488
unitary_stake_return -0.082896 -0.091835 -0.042005 -0.099477 -0.115159 -0.093907
cr_trade 0.810053 0.807301 0.851014 0.801877 0.805042 0.724796
cr_day NaN 0.001404 NaN NaN NaN 0.002769
cum_return -7.797946 -5.158165 -2.414704 -1.238800 -8.098781 -12.336877
volatility (not annualised) 0.857300 0.658245 0.482629 0.153837 1.133337 1.700296
sharpe_ratio -9.095936 -7.836234 -5.003233 -8.052656 -7.145962 -7.255722
maximum_drawdown -7.643900 -4.940637 -2.281173 -0.988308 -8.001381 -11.963136
drawdown_duration (days) 30.000000 27.000000 9.000000 5.000000 29.000000 30.000000
maximum_runup 0.000000 -0.002986 0.000000 0.000000 0.000000 -0.003750
runup_duration (days) 0.000000 1.000000 0.000000 0.000000 0.000000 1.000000
total_pnl -77979.462588 -51581.653515 -24147.043517 -12387.995441 -80987.810831 -123368.773677
n_trading_days 31.000000 21.000000 10.000000 6.000000 30.000000 31.000000
In [112]:
flat_capital_metrics(df, groupby='date')['total_pnl'].hist()
plt.title('Distribution of total daily PnL.')
plt.xlabel('PnL [GBp]')
_ = plt.ylabel('Frequency [%]')
In [113]:
flat_capital_metrics(df, groupby='event_id')['cum_return'].hist()
plt.title('Distribution of returns per event.')
plt.xlabel('Return [%]')
_ = plt.ylabel('Frequency [-]')

Portfolio-based analysis:

Let a portfolio, $\Pi_i$, be the set of trades associated with a given signal to take a position on $n > 0$ markets. Each portfolio will typically comprise $2n$ trades (for each market we have one trade to open and one to close), though it need only have 1 actual trade to be valid. We define the following properties:

  • PnL - The total accumulated profit and loss for the portfolio, starting from the opening time, $t_0^{(i)}$, up to the closing time, $T^{(i)}$.
  • Age - The total time (in seconds) that the portfolio was held open in the $n$ markets: $T^{(i)} - t_0^{(i)}$.
  • Reason - The cause for closing the portfolio.
In [114]:
pdf = pd.DataFrame(columns=['portfolio_id', 'pnl', 'age', 'reason']).set_index('portfolio_id')
gps = df.groupby('portfolio_id')

for g in gps.groups:
    gp = gps.get_group(g).sort_values('dt')
    pdf.loc[g] = [gp.pnl.sum(), (gp.iloc[-1]['dt'] - gp.iloc[0]['dt']).total_seconds(), gp.iloc[-1].reason]

pdf['age'] = pdf['age'].astype(np.int)
pdf.loc[pd.isnull(pdf.reason), 'reason'] = 'unclosed'
In [115]:
pdf.describe().T
Out[115]:
count mean std min 25% 50% 75% max
pnl 15795.0 -23.453798 101.209741 -3520.413129 -36.445804 -14.928833 0.009024 242.571501
age 15795.0 970.372650 896.007022 0.000000 363.000000 726.000000 1323.000000 7324.000000
In [116]:
print 'Minimum PnL:'
pdf[pdf.pnl == pdf.pnl.min()]
Minimum PnL:
Out[116]:
pnl age reason
portfolio_id
5a03338b9316de37733e5711 -3520.413129 723 portfolio_conflict
In [117]:
print 'Maxmimum PnL:'
pdf[pdf.pnl == pdf.pnl.max()]
Maxmimum PnL:
Out[117]:
pnl age reason
portfolio_id
5a037e1e9316de37734e4d4e 242.571501 1923 portfolio_conflict
In [118]:
pnl_out_lim = pdf.pnl.quantile(0.005)
In [119]:
axes = pdf[pdf.pnl > pnl_out_lim][['pnl', 'age']].hist()[0]
plt.suptitle('Distribution of portfolio age and PnL (exclusing extremes).')

axes[0].set_xlabel('Age [s]')
axes[0].set_ylabel('Frequency [-]')

axes[1].set_xlabel('PnL [GBp]')
_ = axes[1].set_ylabel('Frequency [-]')
In [120]:
sns.distplot(pdf[pdf.pnl > pnl_out_lim][pdf.age < pdf.age.quantile(0.25)]['pnl'])
sns.distplot(pdf[pdf.pnl > pnl_out_lim][pdf.age > pdf.age.quantile(0.75)]['pnl'])
plt.legend(['Age < q25', 'Age > q75'])

plt.title('Distribution of portfolio PnLs for the upper and lower quantiles on age (exclusing extremes).')
plt.xlabel('PnL [GBp]')
_ = plt.ylabel('Frequency [-]')
/home/tspooner/.venv/st/lib/python2.7/site-packages/ipykernel_launcher.py:1: UserWarning: Boolean Series key will be reindexed to match DataFrame index.
  """Entry point for launching an IPython kernel.
/home/tspooner/.venv/st/lib/python2.7/site-packages/ipykernel_launcher.py:2: UserWarning: Boolean Series key will be reindexed to match DataFrame index.
  
In [121]:
df.groupby('reason')['pnl'].describe().T
Out[121]:
reason                   
portfolio_conflict  count    40803.000000
                    mean        -8.882405
                    std         72.012004
                    min      -3119.139655
                    25%        -34.954472
                    50%         -0.273913
                    75%         16.408229
                    max        408.447333
Name: pnl, dtype: float64
In [122]:
f, ax = plt.subplots(1)
gps = pdf[pdf.reason.values != None][pdf.pnl > pnl_out_lim].groupby('reason')

for gid, gp in gps:
    gp.pnl.hist(alpha=0.6, label=gid)

_ = plt.legend()
In [123]:
sns.lmplot("age", "pnl", data=pdf, hue="reason", fit_reg=False, aspect=1.8)
plt.title('Distribution of portfolio PnLs as a function of age.')
plt.xlabel('Age [s]')
_ = plt.ylabel('PnL [GBp]')

Best/worst case analysis:

In [124]:
def get_ret(gp, f=lambda vs: vs[-1]):
    prs = gp.sort_values('dt')['plausible_returns'].values
    lens = map(len, prs)
    ix = np.argmax(lens)

    if any(l > 0 for l in lens):
        return f(prs[ix])
    else:
        return 0.0
In [125]:
get_best_ret = lambda gp: get_ret(gp, lambda vs: max(vs))
get_worst_ret = lambda gp: get_ret(gp, lambda vs: min(vs))
In [126]:
gps = df.groupby('portfolio_id')
actual_rets = gps.apply(get_ret)
best_rets = gps.apply(get_best_ret)
worst_rets = gps.apply(get_worst_ret)
In [127]:
plt.scatter(actual_rets, best_rets-worst_rets)
plt.xlabel('$r_{actual}$ [-]')
plt.ylabel('$r_{best} - r_{worst}$ [-]')
_ = plt.title('Relationship between realised return and the range of possible returns during the portfolios life.', y=1.02)
In [128]:
plt.scatter(best_rets, worst_rets)
plt.xlabel('$r_{best}$ [-]')
plt.ylabel('$r_{worst}$ [-]')
_ = plt.title('Relationship between best and worst returns that could have been realised.', y=1.02)
In [129]:
(best_rets-actual_rets).hist(alpha=0.6)
(best_rets-worst_rets).hist(alpha=0.6)
(actual_rets-worst_rets).hist(alpha=0.6)

plt.xlabel('$r$ [-]')
plt.ylabel('Frequency [-]')
plt.title('Distribution of various differences between best/actual/worst possible returns.', y=1.02)
_ = plt.legend(['$r_{best} - r_{actual}$', '$r_{best} - r_{worst}$', '$r_{actual} - r_{worst}$'], loc='best', prop={'size': 20})

Time series:

In [130]:
def load_df(s):
    from sgmarb.backtesting.data import clean_dataframe
    from stratagemdataprocessing.bookmakers.common.odds.cache import HistoricalOddsCache

    hoc = HistoricalOddsCache(parse=False)
    sdf = clean_dataframe(hoc.get('%s.BF' % s), min_matched=0)
    if sdf is not None:
        sdf['timestamp'] = pd.to_datetime(sdf.timestamp, unit='ms')
        return sdf.set_index('timestamp')
    else:
        return None
In [131]:
def do_plot(pk):
    edf = df[df.pair_key == pk].sort_values('dt')
    stickers = edf['sticker'].unique()

    sdfs = {s: load_df(s) for s in stickers}
    sdfs = {s: df for (s, df) in sdfs.iteritems() if df is not None}

    f, axes = plt.subplots(len(sdfs), sharex=True, figsize=(16, 4*len(sdfs)))
    plt.suptitle(pk)

    for i, (s, sdf) in enumerate(sdfs.iteritems()):
        mp = (sdf['bp1'] + sdf['lp1']) / 2.0
        axes[i].plot(sdf.index, mp, drawstyle='steps-post', color='k', alpha=0.5)
        axes[i].set_title(s)
        axes[i].set_ylim([1.0, min(10.0, mp.max()*1.1)])

        for pid, p in edf[edf.sticker == s].groupby('portfolio_id'):
            open_dt = p['dt'].min()
            close_dt = p['dt'].max()

            if p[p.dt == open_dt].iloc[0].is_back:
                axes[i].axvspan(open_dt, close_dt, color='red', alpha=0.1, label='Back')
            else:
                axes[i].axvspan(open_dt, close_dt, color='blue', alpha=0.1, label='Lay')
In [132]:
for pk in df.pair_key.unique()[:50]:
    do_plot(pk)